************************
* The limits of ‘Western’ supply chain sustainability governance to halt deforestation
* Lead Author: Christoph Kubitza
* Email: Christoph.Kubitza@giga-hamburg.de
* Date: 03.7.2025


*Import data
*https://www.reddit.com/r/stata/comments/uwku4x/how_do_i_merge_627_excel_files_in_stata/
cd "C:\Users\christoph.kubitza\OneDrive - GIGA\Working Paper\CK Oil palm investors\Data\01_raw\Palm Watch\raw"
global GFW "C:\Users\christoph.kubitza\OneDrive - GIGA\Working Paper\CK Oil palm investors\Data\01_raw\IND Mills\UML\GFW"
global PW "C:\Users\christoph.kubitza\OneDrive - GIGA\Working Paper\CK Oil palm investors\Data\01_raw\Palm Watch"
global UML "C:\Users\christoph.kubitza\OneDrive - GIGA\Working Paper\CK Oil palm investors\Data\01_raw\IND Mills\UML"

local folder: dir "./" files "*.csv"
display `folder'

clear
tempfile new_data
frame create import
foreach file in `folder'{
  frame import{
    import delimited using "`file'", clear
    tostring years, replace
    save "`new_data'", replace
  }
  append using "`new_data'"
}

save "$PW\data\mills_pw.dta", replace


*Make a list of mills with data on rspo and number of supply chain links to MNCs
use "$PW\data\mills_pw.dta", clear
drop if km_0==.
split years, p(,) gen(y_)

gen rspo_mill=1 if rspostatus=="RSPO Certified"
replace rspo_mill=0 if rspostatus=="Not RSPO Certified"

forvalues i=2017/2021 {
gen y_`i'=1 if  y_1=="`i'" | y_2=="`i'" | y_3=="`i'" | y_4=="`i'" | y_5=="`i'"
}
collapse (sum) y_2017 y_2018 y_2019 y_2020 y_2021 (max) rspo_mill, by( umlid)
rename y_* wsupply_*

save "$PW\data\mills_pw_collapsed.dta", replace

*Gen UML data
import dbase using "$GFW\gfw_universal_mill_list_v20220531.dbf", clear case(lower)
rename uml_id umlid
save "$GFW\gfw_universal_mill_list_v20220531.dta", replace

*Import the concession layer 53 with the merged catchment area based on overlap and nearest distance
import dbase using "$PW\merge catchment area\merge_concessions_catchmentarea_v4.dbf", clear 
*Replace the missing data for overlap with catchment area with data one from neareast neighbor merge
replace uml_id= uml_id_2 if uml_id==""                    
replace rspo_statu=rspo_sta_1 if rspo_statu==""
gen rspo_mill=1 if rspo_statu=="RSPO Certified" 
replace rspo_mill=0 if rspo_statu=="Not RSPO Certified" 
keep deal_id poly_id uml_id rspo_mill
*Drop duplicates from spatial merge process
duplicates drop poly_id uml_id, force
rename uml_id umlid
*Now merge the information on rspo and supply chains. Note that for some of the mills there is no supply chain data and that are are multiple matches 
merge m:m umlid using "$PW\data\mills_pw_collapsed.dta", keep(mas mat) nogen
gen mill_scdata=1 if wsupply_2021!=.
replace mill_scdata=0 if wsupply_2021==.

/*
*Check if all data is in uml 2022 list
*Get uml list from 2022, we override the PW RSPO data using the GFW data on RSPO since it is for 2022, but use the data for 2024 catchment boundaries without matching GFW UML mill
merge m:1 umlid using "$GFW\gfw_universal_mill_list_v20220531.dta", keep(mat mas) keepus(rspo_statu rspo_type date) nogen
*/

keep deal_id poly_id umlid wsupply_* rspo_mill mill_scdata 
sort poly_id deal_id
collapse (mean) deal_id (mean) wsupply_* (max) mill_scdata  (max) rspo_mill , by(poly_id)
egen w_supply1721=rowtotal( wsupply_2017 wsupply_2018 wsupply_2019 wsupply_2020 wsupply_2021)
save "$PW\data\Concessions_mills_pw_collapsed.dta", replace
